/**********************************
name:		kayla freeman
date:		8/1/24
purpose:	prep tc sample, merging collected tc data with supp-cust links and compustat ctrls
**********************************/


clear
use "01_comp_basic_raw_vars" /*compustat download; all available years (up to 2017) are needed for the age calculation*/

*basic clean
drop if indfmt == "FS"
drop if missing(gvkey)
drop if missing(fyear)
rename fyear year

/*resolve duplicate gvkey-years: order each gvkey-year group by at so the
  retained row is deterministic (first row = smallest at, missing at last),
  then drop every later copy. dup>1 also removes third and later duplicates,
  which a test of dup==2 would leave in the data*/
bysort gvkey year (at): gen dup = cond(_N==1,0,_n)
tab dup 
drop if dup>1

/*make a tempfile with last year gvkey is in compustat*/
preserve
drop if missing(year)
/*the max has to be taken over the whole gvkey: within a gvkey-year group
  max(year) is just year, and the single row kept below would then carry the
  firm's first year instead of its last*/
bys gvkey: egen lastcompy=max(year)
bys gvkey: keep if _n==1
keep gvkey lastcompy
tempfile lasty /*needed later for construction of relationship end*/
save `lasty'
restore

/*firm-level controls built from the compustat panel in memory; the result is
  saved as tempfile inprog and merged onto each side of the supply-chain pairs*/

*log age: years since the gvkey's first year in the data, plus one, in logs
bysort gvkey (year): gen lnage = ln(year - year[1] + 1)

*industry concentration (3-digit sic by year); negative revenue is set to zero first
replace revt = 0 if revt < 0
gen sic3 = substr(sic,1,3)
/*NOTE(review): hhi is not a built-in command, presumably the user-written
  package; it is relied on to create hhi_revt - confirm it is installed*/
hhi revt, by(sic3 year)
sort sic3 year
rename hhi_revt hhi

*market share: firm revenue over total revenue of its sic3-year cell
bysort sic3 year: egen ind_rev = total(revt)
gen mkt_share = revt/ind_rev

*trade credit, scaled by assets and by the matching flow variable
gen ap_at   = ap/at
gen ar_at   = rect/at
gen ap_cogs = ap/cogs
gen ar_revt = rect/revt

*missing debt components and R&D are treated as zero
foreach v of varlist dltt dlc xrd {
	replace `v' = 0 if missing(`v')
}

*size
gen lnat = ln(at)

*book leverage; total debt is rebuilt from its components when dt is missing
replace dt = dltt + dlc if missing(dt)
gen blev = dt/at

*short-term debt over assets
gen curdat = dlc/at

*q
gen q = (csho*prcc_f + at - ceq)/at

*tangibility
gen tang = ppent/at

*profitability
gen profit = oibdp/at

*R&D intensity
gen rdat = xrd/at

*cash
gen cashat = che/at

/*next fiscal year's revenue, filled only when the following row of the same
  gvkey is exactly one year ahead*/
bysort gvkey (year): gen leadrev = revt[_n+1] if year==year[_n+1]-1

*numeric firm and 2-digit industry identifiers
egen id = group(gvkey)
gen sic2 = substr(sic,1,2)
egen ind = group(sic2)

keep gvkey datadate year ap at cogs rect revt sic lnage hhi mkt_share ///
	ap_at ar_at ap_cogs ar_revt lnat q tang blev profit id sic2 ind ///
	rdat curdat dt cashat leadrev

*attach each gvkey's last compustat year; rows found only in the tempfile are not kept
merge m:1 gvkey using `lasty', keep(master match) nogenerate

tempfile inprog
save `inprog'

/*customer-supplier links from compustat segment disclosure: combined (cleaner)
  data from Edward Fee, Janet Gao, and Yixin Liu, plus the wrds version with
  customer gvkeys*/
use 02_supply_chain_links, clear

merge 1:1 s_gvkey c_gvkey year using 00_rawTCdata
/*variables in 00_rawTCdata:
	s_gvkey, c_gvkey - gvkeys of supplier and customer
	csale        - annual sale from s to c
	na           - availability of tc data in the 10-k: na, yes, vague, missing, or bad
	               (bad = scraping errors from prior sources found during 10-k
	               inspection; removed below)
	ar_amount    - customer receivable balance in dollars, if so reported
	ar_perc      - customer receivable balance in percent, if so reported
	percnewcsale - correction (%) for a data error found in 10-k inspection
	newcsale     - correction ($) for a data error found in 10-k inspection
	badmatch     - flags an obs to drop because it was a false match from a
	               scraping error in the original data
	*/

/*first remove the bad matches and blank out the wrong csales found during
  data collection*/
tab badmatch /*22 yes*/
tab na
drop if badmatch=="yes"
drop if na=="bad"
tab miss_csale /*21 1s*/
/*csale was actually missing from the 10-k and incorrectly reported in the
  original data*/
replace csale = . if miss_csale==1
drop badmatch miss_csale

/*filters: zero sales and self-links*/
count if csale==0 
drop if csale==0
count if s_gvkey==c_gvkey 
drop if s_gvkey==c_gvkey

/*computed before pairs without tc data are dropped: customers per
  supplier-year, suppliers per customer-year, first year of each pair, and the
  prior year's csale (only when the previous row is exactly one year earlier)*/
bysort s_gvkey year: gen numcust = _N
bysort c_gvkey year: gen numsupp = _N
bysort s_gvkey c_gvkey (year): gen scyr1 = year[1]
/*NOTE(review): lagcsale is taken from csale as it stands here, i.e. before the
  newcsale and percnewcsale corrections applied further down - confirm intended*/
by s_gvkey c_gvkey: gen lagcsale = csale[_n-1] if year == year[_n-1]+1
format s_gvkey c_gvkey %6s

/*last year in which the pair reports a non-missing csale*/
drop if missing(csale)
bysort s_gvkey c_gvkey: egen lastyr = max(year)

/*keep only pairs present in the hand-collected tc data*/
keep if _merge != 1
drop _merge
rename (conm cconm) (s_name c_name)

/*attach supplier-side compustat controls: the supplier id is temporarily
  renamed to gvkey to match the key of the controls tempfile, only matched
  supplier-years are kept, and every merged variable then gets an s_ prefix*/
ren s_gvkey gvkey
 merge m:1 gvkey year using `inprog'
	keep if _merge==3 
	drop _merge


foreach var of varlist gvkey  datadate lastcompy leadrev ap at cogs rect  revt sic  lnage  hhi mkt_share ap_at ar_at ap_cogs ar_revt lnat q tang blev profit id sic2 ind  dt rdat curdat cashat {
	ren `var' s_`var'
	}

label var s_lnage "S log age"
label var s_hhi "S HHI"
label var s_mkt_share "S market share"
/*ap_at and ar_at are scaled by total assets; the COGS- and sales-scaled
  versions are ap_cogs and ar_revt, so each label now names its own scaling*/
label var s_ap_at "S AP/Assets"
label var s_ar_at "S AR/Assets"
label var s_ap_cogs "S AP/COGS"
label var s_ar_revt "S AR/Sales"
label var s_lnat "S size"
label var s_q "S Q"
label var s_tang "S tangibility"
label var s_blev "S book lev."
label var s_profit "S profit"
label var s_sic2 "s_sic2"
label var s_rdat "S R&D intensity"
label var s_curdat "S curr debt/assets"
/*attach customer-side compustat controls the same way: the customer id is
  temporarily renamed to gvkey, only matched customer-years are kept, and the
  merged variables get a c_ prefix*/
ren c_gvkey gvkey
merge m:1 gvkey year using `inprog' 
	keep if _merge==3
	drop _merge
	
/*NOTE(review): dt is not in this rename list, so the customer's total debt
  stays in the data under the unprefixed name dt (the supplier's is s_dt);
  left unchanged so later code that reads dt keeps working - confirm intended*/
foreach var of varlist gvkey  datadate  lastcompy leadrev ap at cogs rect  revt sic  lnage  hhi mkt_share ap_at ar_at ap_cogs ar_revt lnat q tang blev profit id sic2 ind   rdat curdat cashat {
	ren `var' c_`var'
	}
label var c_lnage "C log age"
label var c_hhi "C HHI"
label var c_mkt_share "C market share"
/*ap_at and ar_at are scaled by total assets; the COGS- and sales-scaled
  versions are ap_cogs and ar_revt, so each label now names its own scaling*/
label var c_ap_at "C AP/Assets"
label var c_ar_at "C AR/Assets"
label var c_ap_cogs "C AP/COGS"
label var c_ar_revt "C AR/Sales"
label var c_lnat "C size"
label var c_q "C Q"
label var c_tang "C tangibility"
label var c_blev "C book lev."
label var c_profit "C profit"
label var c_sic2 "c_sic2"
label var c_rdat "C R&D intensity"
label var c_curdat "C curr debt/assets"


/*fix csale: apply the corrections recorded during 10-k inspection.
  newcsale is a corrected dollar amount (divided by 1,000,000 to match csale);
  percnewcsale is a corrected share that is multiplied by supplier revenue.
  When both are present the share-based value is applied last and wins.*/
order csale newcsale percnewcsale,after(year)
	/*one of these is misplaced in perc; should be newcsale in dollars.
	  flag it, move the value, and blank the perc field: otherwise Inewperc
	  below would also be 1 for that obs and the share-based replace would
	  overwrite csale with (dollar amount x s_revt)*/
	gen byte misplaced = !missing(percnewcsale) & missing(newcsale) & percnewcsale>100000
	replace newcsale = percnewcsale if misplaced==1
	replace percnewcsale = . if misplaced==1
	
gen Inewperc = !missing(percnewcsale)
gen Inewcsale =!missing(newcsale)
replace csale = newcsale/1000000 if Inewcsale==1 
/*NOTE(review): this treats percnewcsale as a fraction of s_revt, not as
  percentage points - confirm against how the field was coded*/
replace csale = percnewcsale*s_revt if Inewperc==1 
count if Inewcsale ==1|Inewperc==1 

drop percnewcsale newcsale  Inewperc Inewcsale misplaced


sort s_gvkey c_gvkey year
/*cust sales gr: lag and lead growth are filled only when the neighbouring row
  of the same pair is exactly one year away*/
gen salesgr = .
by s_gvkey c_gvkey: replace salesgr = csale/csale[_n-1] -1 if _n>1 & year==year[_n-1]+1
by s_gvkey c_gvkey: gen leadsalesgr = csale[_n+1]/csale -1 if _n!=_N & year==year[_n+1]-1
/*same consecutive-year guard as leadsalesgr and lagcsale, so a gap in
  reporting gives a missing lead rather than the csale of a later year*/
by s_gvkey c_gvkey: gen leadcsale = csale[_n+1] if _n!=_N & year==year[_n+1]-1

/*total reported customer sales of the supplier in the year*/
bys s_gvkey year: egen allcsale = total(csale)

/*relationship end: the pair's last reported year, unless that is also the last
  compustat year of either firm or falls too close to the end of the sample*/
gen cs_end = year==lastyr
	replace cs_end = . if year==s_lastcompy |year==c_lastcompy
	replace cs_end = . if year>2014 /*2017 is incomplete, so consider 2016 the end of the sample and require two years of no report to call it cs_end*/
	drop *lastcomp*
tempfile getleadconc
save `getleadconc'
	
/*next-year customer count and concentration: shift year back by one so that
  year t+1 totals are keyed to year t, collapse to one row per supplier-year,
  and merge back onto the pair-level file saved above*/
replace year = year-1
bys s_gvkey year: gen lead_numcust = _N
bys s_gvkey year: egen lead_custconc = total(csale)
bys s_gvkey year: keep if _n==1
keep s_gvkey year lead_custconc lead_numcust
merge 1:m s_gvkey year using `getleadconc'
drop if _merge==1
drop _merge
/*scale next-year customer sales by the supplier's next-year revenue*/
replace lead_custconc = lead_custconc/s_leadrev
	ren s_leadrev s_leadrevt
/*first day of each firm's fiscal-year-end month, used as the ratings merge key*/
gen cdate1 = mdy(month(c_datadate),1,year(c_datadate))
gen sdate1 = mdy(month(s_datadate),1,year(s_datadate))
tempfile almost
save `almost'

/*ratings: attach the rating on file for the month of each firm's fiscal year
  end, first for the customer and then for the supplier*/
use 03_ratings, clear /*ratings data*/
keep gvkey spltic datadate
*destring gvkey,replace
rename spltic rating
/*move every rating date to the first of its month so it lines up with the
  cdate1 and sdate1 keys*/
replace datadate = dofm(mofd(datadate))
rename (datadate gvkey) (sdate1 s_gvkey)
tempfile rats
save `rats' /*supplier-keyed copy, merged in further down*/

/*customer side: re-key the ratings in memory and use them as the master*/
rename (sdate1 s_gvkey rating) (cdate1 c_gvkey c_rating)
merge 1:m c_gvkey cdate1 using `almost'
keep if _merge != 1
/*rated = matched to a ratings row that carries a non-missing rating*/
gen crated = _merge==3 & !missing(c_rating)
drop _merge

/*supplier side: the pair file is now the master*/
merge m:1 s_gvkey sdate1 using `rats'
keep if _merge != 2
gen srated = _merge==3 & !missing(rating)
rename rating s_rating
drop _merge

/*this file includes the obs checked with no tc; needed for heckman tests, etc*/
save 04_prepped_data,replace

